The following code is largely based on the Geodemographics & Data Reduction lab from the Urban Analytics text by Singleton (2017). It builds a geodemographic classification by following the methodology used to create the UK Output Area Classification, applied in a single-city context: Liverpool. An interpretation of the final cluster map can be found in bold below.
We first load a series of tables: “Census_2011_Count_All” is a data frame containing census data for the UK, which are detailed in “Variable_Desc”. The “Liverpool” data frame is a list of output areas within Liverpool, UK, while “OAC_Input_Lookup” is a lookup for the variables used to create OAC.
# Load data
# Loads the census workspace, which provides the objects used below:
# Census_2011_Count_All, Variable_Desc, Liverpool, and OAC_Input_Lookup
# (see the preceding paragraph for what each contains).
load("./data/census_2011_UK_OA.RData")
We now create a subset of the input data for Liverpool by merging the data frames that contain the output areas and the census data:
# Crop: subset the national census counts to Liverpool's Output Areas.
# A left join (all.x = TRUE) keeps every Liverpool OA even if a census
# record were missing for it.
Census_2011_Count <- merge(
  x     = Liverpool,
  y     = Census_2011_Count_All,
  by    = "OA",
  all.x = TRUE
)
The first stage in the analysis is to create aggregations for the input data. The following code calculates all of the numerators:
# Build the numerator table: one aggregated census variable per row of
# the OAC input lookup, keyed by Output Area code.
OAC_Input <- as.data.frame(Census_2011_Count$OA)
colnames(OAC_Input) <- "OA"
# Compute each aggregated variable as a one-column data frame, then bind
# them all at once (avoids growing the data frame with cbind in a loop).
# seq_len() is safe even if the lookup table were empty (unlike 1:nrow()).
numerator_cols <- lapply(seq_len(nrow(OAC_Input_Lookup)), function(n) {
  # Comma-separated list of census columns to aggregate for this variable
  select_vars <- unlist(strsplit(paste(OAC_Input_Lookup[n, "England_Wales"]), ","))
  # Sum the selected census counts for each Output Area
  tmp <- data.frame(rowSums(Census_2011_Count[, select_vars, drop = FALSE]))
  # Name the column after the OAC variable code (e.g. "k001")
  colnames(tmp) <- OAC_Input_Lookup[n, "VariableCode"]
  tmp
})
OAC_Input <- cbind(OAC_Input, do.call(cbind, numerator_cols))
remove(numerator_cols)
# The variables for k035 were included above merely for coding simplicity:
# the Standardized Illness Ratio is calculated later, so drop it from the
# numerator data frame here.
OAC_Input$k035 <- NULL
We now create another data frame which contains the denominators…
# Build the denominator table keyed by Output Area code.
# (The original code first created OAC_Input_den from the OA column and
# then overwrote it wholesale below, so that dead assignment is removed.)
# Create a list of unique denominator variable codes, dropping blanks
# (ratio-type variables in the lookup have an empty denominator field).
den_list <- unique(OAC_Input_Lookup[,"Denominator"])
den_list <- paste(den_list[den_list != ""])
# Select the OA code plus every denominator column from the census table
OAC_Input_den <- Census_2011_Count[,c("OA",den_list)]
… so as to then merge it with the numerators:
#Merge
# Inner join on OA: attaches the denominator columns to the numerators so
# each row holds everything needed to compute percentages for that OA.
OAC_Input <- merge(OAC_Input,OAC_Input_den, by="OA")
To match numerators and denominators and calculate percentages we use a loop that runs through a list of variable names created as follows:
# Get the numerator/denominator pairs where the Type is "Count" - i.e. the
# variables expressed as percentages rather than ratios.
# Select the two columns by name rather than by position (the original
# used c(1,3), which is brittle if the lookup's column order changes).
K_Var <- OAC_Input_Lookup[OAC_Input_Lookup$Type == "Count",
                          c("VariableCode", "Denominator")]
# Start from the OA codes; the percentage columns are appended below.
OAC_Input_PCT_RATIO <- subset(OAC_Input, select = "OA")
# Compute each percentage as a one-column data frame, then bind them all
# at once (avoids growing the data frame with cbind inside the loop).
# seq_len() is safe even if K_Var were empty (unlike 1:nrow()).
pct_cols <- lapply(seq_len(nrow(K_Var)), function(n) {
  num <- paste(K_Var[n, "VariableCode"]) # numerator name
  den <- paste(K_Var[n, "Denominator"])  # denominator name
  # Percentage of the denominator population; NaN/Inf possible if a
  # denominator is zero for an OA, matching the original behaviour.
  tmp <- data.frame(OAC_Input[, num] / OAC_Input[, den] * 100)
  colnames(tmp) <- num
  tmp
})
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO, do.call(cbind, pct_cols))
remove(pct_cols)
The final two variables include density (k007), which can be joined from the original census table:
# Extract the population-density variable (census column KS101EW0008),
# rename it to its OAC code k007, and join it on by Output Area.
density_var <- setNames(
  Census_2011_Count[, c("OA", "KS101EW0008")],
  c("OA", "k007")
)
OAC_Input_PCT_RATIO <- merge(OAC_Input_PCT_RATIO, density_var, by = "OA")
We now calculate the variable k035, the standardized illness ratio (SIR), which needs to be calculated separately for each subset of the national data — in this case Liverpool:
# Standardized Illness Ratio (k035): observed ill people as a percentage
# of the number expected given each OA's age structure, using rates
# computed across the Liverpool subset only.
# Observed counts of ill people, split into working age (16-64) and the
# remainder (15 or less, or 65 and over).
ill_working <- rowSums(Census_2011_Count[, c("KS301EW0005", "KS301EW0006")]) # ill, 16-64
ill_all     <- rowSums(Census_2011_Count[, c("KS301EW0002", "KS301EW0003")]) # ill, all ages
ill_other   <- ill_all - ill_working # ill, <=15 or >=65
# Population totals for the same two age bands
pop_working <- rowSums(Census_2011_Count[, c("KS102EW0007", "KS102EW0008",
                                             "KS102EW0009", "KS102EW0010",
                                             "KS102EW0011", "KS102EW0012",
                                             "KS102EW0013")]) # people 16-64
pop_all     <- Census_2011_Count$KS101EW0001 # all people
pop_other   <- pop_all - pop_working # people <=15 or >=65
# Expected ill counts: apply the Liverpool-wide age-band illness rates to
# each OA's age-band populations, then sum the bands.
exp_ill_working <- pop_working * (sum(ill_working) / sum(pop_working))
exp_ill_other   <- pop_other * (sum(ill_other) / sum(pop_other))
exp_ill_all     <- exp_ill_working + exp_ill_other
# SIR = observed / expected * 100
SIR <- data.frame(k035 = ill_all / exp_ill_all * 100)
# Append k035 to the input table
OAC_Input_PCT_RATIO <- cbind(OAC_Input_PCT_RATIO, SIR)
# Remove the intermediate objects
remove(list = c("SIR", "ill_working", "ill_all", "ill_other", "pop_working",
                "pop_all", "pop_other", "exp_ill_working", "exp_ill_other",
                "exp_ill_all"))
We now apply the two standardization and normalization procedures to the input data, namely inverse hyperbolic sine and range standardization.
# Inverse hyperbolic sine transform of every input variable.
# Base R's asinh(x) is exactly log(x + sqrt(x^2 + 1)), so use it instead
# of the hand-rolled formula, and select the variable columns by name
# rather than the brittle hard-coded positions 2:61 (everything except
# the OA code column).
input_cols <- setdiff(colnames(OAC_Input_PCT_RATIO), "OA")
OAC_Input_PCT_RATIO_IHS <- asinh(OAC_Input_PCT_RATIO[, input_cols])
# Range (0-1) standardization: rescales each column so its minimum maps
# to 0 and its maximum to 1.
range_01 <- function(x){(x-min(x))/(max(x)-min(x))} # range function
OAC_Input_PCT_RATIO_IHS_01 <- apply(OAC_Input_PCT_RATIO_IHS, 2, range_01) # applies range function to columns
# Adds the OA codes back onto the matrix as row names (apply() drops them)
rownames(OAC_Input_PCT_RATIO_IHS_01) <- OAC_Input_PCT_RATIO$OA
The following is quoted directly from the Lab:
"We have now created a subset of 1584 Output Areas for the extent of Liverpool with inputs that have mirrored the attributes, measures, transformation and standardization methods used for the UK OAC 2011 classification. Prior to clustering this bespoke Liverpool classification, it is worth considering what would be an appropriate number of clusters for the initial Super Group (most aggregate) level.
This can be considered by running some test cluster analysis with different cluster frequency (k), and for each result, examining a statistic called the total within sum of squares. This is a measure of how well the cluster frequency fits the data - essentially the mean standardized distance of the data observations to a cluster mean. These tests are typically plotted, with the purpose to identify an “elbow criterion” which is a visual indication of where an appropriate cluster frequency might be set. The trade off you are looking for is the impact of increasing cluster frequency (i.e. making a more complex classification) versus a reduction in this score."
library(ggplot2)
# Total within-cluster sum of squares (Twss) for k = 2..12 clusters.
# vapply preallocates the result, avoiding the original pattern of
# growing a vector whose unused first slot then had to be dropped.
# NOTE(review): kmeans uses random starts, so results differ between
# runs unless set.seed() is called beforehand - confirm whether the lab
# intends reproducible output.
k_range <- 2:12
twss <- vapply(k_range, function(k) {
  sum(kmeans(OAC_Input_PCT_RATIO_IHS_01, centers = k, nstart = 20)$withinss)
}, numeric(1))
# Assemble the results for plotting: one row per candidate k
wss <- data.frame(k = k_range, Twss = twss)
# Plot the elbow curve: look for where adding clusters stops paying off
ggplot(data = wss, aes(x = k, y = Twss)) +
  geom_path() +
  geom_point() +
  scale_x_continuous(breaks = k_range) +
  labs(y = "Total within sum of squares")
Since there are no large decreases in the within sum of squares and there is a minor moderation of the decrease around 7 or 8 clusters, a 7 cluster solution was chosen.
Now that we have chosen seven clusters, we now consider how the partitioning can be optimized.
# Load cluster object
# Loads the pre-computed 7-cluster k-means result as "cluster_7"
# (a kmeans object; its $cluster element is used below).
load("./data/cluster_7.Rdata")
The cluster results can be accessed as follows:
# Build an OA -> cluster lookup table from the saved k-means result.
# The cluster vector is named by OA code, so the codes double as row
# names and supply the OA column directly.
lookup <- data.frame(cluster_7$cluster)
lookup$OA <- rownames(lookup)
names(lookup) <- c("K_7", "OA")
# Recode the numeric cluster labels as letters (A-G) for the Super Groups
lookup$SUPER <- LETTERS[lookup$K_7]
It is also important to look at the distribution of these clusters:
# Frequency of each of the seven clusters across Liverpool's Output Areas
table(lookup$K_7)
##
## 1 2 3 4 5 6 7
## 259 340 279 334 109 73 190
Here, a map of the clusters is produced. Note the changes from the lab, which include the title, the type of palette, and the border line color.
# Load packages
# NOTE(review): rgdal and rgeos were retired from CRAN in 2023; consider
# migrating to sf (sf::st_read) if this document is re-run - verify first.
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.5-8, (SVN revision 990)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/rgdal/proj
## Linking to sp version:1.4-2
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
library(tmap)
library(rgeos)
## rgeos version: 0.5-3, (SVN revision 634)
## GEOS runtime version: 3.7.2-CAPI-1.11.2
## Linking to sp version: 1.4-1
## Polygon checking: TRUE
# Import OA boundaries (taking out "OGRGeoJSON")
# Reads the 1,584 Liverpool OA polygons into a Spatial*DataFrame
liverpool_SP <- readOGR("./data/Liverpool_OA_2011.geojson")
## OGR data source with driver: GeoJSON
## Source: "/Users/Marcos/Downloads/Geographic Information Science III/Lab 6/urban_analytics/10_Data_Reduction_Geodemographics/data/Liverpool_OA_2011.geojson", layer: "Liverpool_OA_2011"
## with 1584 features
## It has 1 fields
# Merge lookup
# Attaches the cluster assignments (K_7 / SUPER) to each OA polygon;
# the boundary file keys on "oa_code" while the lookup keys on "OA".
liverpool_SP <- merge(liverpool_SP, lookup, by.x="oa_code",by.y="OA")
# Thematic map of the Super Group clusters, projected to British National
# Grid (EPSG:27700), with pastel fills and faint grey OA borders.
m <- tm_shape(liverpool_SP, projection=27700) +
tm_polygons(col="SUPER", border.col = "grey30", palette="Pastel1",border.alpha = .3, title="Clusters of the city of Liverpool", showNA=FALSE) +
tm_layout(legend.position = c("left", "bottom"), frame = FALSE)
#Create leaflet plot
# Renders the tmap object as an interactive leaflet widget.
# NOTE(review): the invalid-shape warning below suggests some polygons
# fail validity checks; sf::st_make_valid may clean them - verify.
tmap_leaflet(m)
## Warning: The shape liverpool_SP is invalid. See sf::st_is_valid
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
## legend.postion is used for plot mode. Use view.legend.position in tm_view to set the legend position in view mode.